In [1]:
# Core analysis stack: pandas/numpy for data, matplotlib/seaborn for plots.
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

# Display settings
# Show every DataFrame column when rendering (no "..." truncation) and use
# seaborn's white-grid theme for all figures in this notebook.
pd.set_option('display.max_columns', None)

sns.set(style="whitegrid")
In [2]:
# Load the raw store transactions export.
# latin1 is used because the file is not valid UTF-8 (every byte decodes in
# latin1, though mis-mapped characters can slip through silently).
# NOTE(review): hardcoded absolute local path — prefer a configurable
# DATA_DIR so the notebook runs on other machines.
df = pd.read_csv(r'C:\Users\B3Stu\OneDrive\Documents\Python Exercises\Emadestore.csv', encoding='latin1')

# Preview the first five rows (bare last expression renders a rich table).
df.head()
Out[2]:
Row ID Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment Country City State Postal Code Region Product ID Category Sub-Category Product Name Sales Quantity Discount Profit
0 1 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520 Claire Gute Consumer United States Henderson Kentucky 42420 South FUR-BO-10001798 Furniture Bookcases Bush Somerset Collection Bookcase 261.9600 2 0.00 41.9136
1 2 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520 Claire Gute Consumer United States Henderson Kentucky 42420 South FUR-CH-10000454 Furniture Chairs Hon Deluxe Fabric Upholstered Stacking Chairs,... 731.9400 3 0.00 219.5820
2 3 CA-2016-138688 6/12/2016 6/16/2016 Second Class DV-13045 Darrin Van Huff Corporate United States Los Angeles California 90036 West OFF-LA-10000240 Office Supplies Labels Self-Adhesive Address Labels for Typewriters b... 14.6200 2 0.00 6.8714
3 4 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale Florida 33311 South FUR-TA-10000577 Furniture Tables Bretford CR4500 Series Slim Rectangular Table 957.5775 5 0.45 -383.0310
4 5 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale Florida 33311 South OFF-ST-10000760 Office Supplies Storage Eldon Fold 'N Roll Cart System 22.3680 2 0.20 2.5164
In [5]:
# Data-quality check: count nulls in every column before any cleaning.
null_counts = df.isnull().sum()
print("Missing values per column:\n", null_counts)
Missing values per column:
 Row ID           0
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            0
Quantity         0
Discount         0
Profit           0
dtype: int64
In [7]:
# Fill numeric missing values with each column's median.
# numeric_only=True keeps object/string columns out of the median computation
# (a plain df.median() would raise on this dataset's many text columns —
# that was the bug in the earlier, commented-out attempt, now removed).
# NOTE: the isnull() check above showed zero nulls, so this is a safeguard.
# Assignment is used instead of inplace=True for clearer data lineage.
df = df.fillna(df.median(numeric_only=True))
In [9]:
# Drop exact duplicate rows, if any.
# Assignment instead of inplace=True: identical result, avoids the hidden
# in-place mutation anti-pattern and keeps the cell idempotent on re-run.
df = df.drop_duplicates()
In [11]:
# Inspect each column's inferred dtype (note: the date columns are still
# plain object/strings at this point, not datetimes).
column_types = df.dtypes
print("Data types:\n", column_types)
Data types:
 Row ID             int64
Order ID          object
Order Date        object
Ship Date         object
Ship Mode         object
Customer ID       object
Customer Name     object
Segment           object
Country           object
City              object
State             object
Postal Code        int64
Region            object
Product ID        object
Category          object
Sub-Category      object
Product Name      object
Sales            float64
Quantity           int64
Discount         float64
Profit           float64
dtype: object
In [ ]:
# Summary statistics
# Count/mean/std/min/quartiles/max for the numeric columns; the bare last
# expression renders the table inline.
df.describe()
Out[ ]:
Row ID Postal Code Sales Quantity Discount Profit
count 9994.000000 9994.000000 9994.000000 9994.000000 9994.000000 9994.000000
mean 4997.500000 55190.379428 229.858001 3.789574 0.156203 28.656896
std 2885.163629 32063.693350 623.245101 2.225110 0.206452 234.260108
min 1.000000 1040.000000 0.444000 1.000000 0.000000 -6599.978000
25% 2499.250000 23223.000000 17.280000 2.000000 0.000000 1.728750
50% 4997.500000 56430.500000 54.490000 3.000000 0.200000 8.666500
75% 7495.750000 90008.000000 209.940000 5.000000 0.200000 29.364000
max 9994.000000 99301.000000 22638.480000 14.000000 0.800000 8399.976000
In [27]:
# Distribution of orders across product categories.
# BUG FIX: the original ran countplot on 'Profit' — a continuous column with
# thousands of distinct values — producing one unreadable bar per value (and
# carried a stale "Outcome (0 = No Diabetes...)" comment copy-pasted from a
# different notebook). Counting the low-cardinality 'Category' column gives
# the intended kind of plot.
sns.countplot(x='Category', data=df)

plt.title("Emade Store Data Analytics")

plt.show()
No description has been provided for this image
In [15]:
# Histogram of per-line-item Sales with a KDE overlay.
# NOTE(review): the old "Histogram for Age" comment was copy-pasted from a
# diabetes notebook — the column plotted here is Sales.

sns.histplot(df['Sales'], kde=True, bins=20)

plt.title("Sales Data")

plt.show()
No description has been provided for this image
In [17]:
# Boxplot of order Quantity to surface spread and outliers.
# NOTE(review): stale "Boxplot for Glucose" comment replaced — this plots
# Quantity from the store data.

sns.boxplot(x=df['Quantity'])

plt.title("Quantity Spread")

plt.show()
No description has been provided for this image
In [19]:
# Discount distribution at each Quantity level. Quantity is low-cardinality
# (integers 1-14 per the describe() output), so it works as a categorical
# x-axis here.
# NOTE(review): stale "Glucose vs Outcome" comment replaced.

sns.boxplot(x='Quantity', y='Discount', data=df)

plt.title("Quantity vs Discount Outcome")

plt.show()
No description has been provided for this image
In [21]:
# Scatter of Quantity vs Discount, colored by Sales (continuous hue).
# With few distinct Quantity and Discount values, many points overlap —
# alpha=0.7 keeps the overplotting readable.
# NOTE(review): stale "Age vs BMI colored by Outcome" comment replaced.

plt.figure(figsize=(8,6))

sns.scatterplot(x='Quantity', y='Discount', hue='Sales', data=df, alpha=0.7)

plt.title("Quantity vs Discount")

plt.show()
No description has been provided for this image
In [29]:
# Pairwise Pearson correlations over the numeric columns only
# (string/object columns cannot be correlated and are excluded).
numeric_cols = df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_cols.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()
No description has been provided for this image
In [31]:
# Install if not already installed

!pip install ydata-profiling

 

from ydata_profiling import ProfileReport

 

# Generate profiling report

profile = ProfileReport(df, title="Healthcare Data Profiling Report", explorative=True)

 

# Save to HTML

profile.to_file("healthcare_profile_report.html")

 

print("✅ Profiling report generated: healthcare_profile_report.html")
Requirement already satisfied: ydata-profiling in c:\users\b3stu\anaconda3\lib\site-packages (4.16.1)
Requirement already satisfied: scipy<1.16,>=1.4.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (1.13.1)
Requirement already satisfied: pandas!=1.4.0,<3.0,>1.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (2.2.2)
Requirement already satisfied: matplotlib<=3.10,>=3.5 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (3.9.2)
Requirement already satisfied: pydantic>=2 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (2.8.2)
Requirement already satisfied: PyYAML<6.1,>=5.0.0 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (6.0.1)
Requirement already satisfied: jinja2<3.2,>=2.11.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (3.1.4)
Requirement already satisfied: visions<0.8.2,>=0.7.5 in c:\users\b3stu\anaconda3\lib\site-packages (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling) (0.8.1)
Requirement already satisfied: numpy<2.2,>=1.16.0 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (1.26.4)
Requirement already satisfied: htmlmin==0.1.12 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (0.1.12)
Requirement already satisfied: phik<0.13,>=0.11.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (0.12.5)
Requirement already satisfied: requests<3,>=2.24.0 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (2.32.3)
Requirement already satisfied: tqdm<5,>=4.48.2 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (4.66.5)
Requirement already satisfied: seaborn<0.14,>=0.10.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (0.13.2)
Requirement already satisfied: multimethod<2,>=1.4 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (1.12)
Requirement already satisfied: statsmodels<1,>=0.13.2 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (0.14.2)
Requirement already satisfied: typeguard<5,>=3 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (4.4.4)
Requirement already satisfied: imagehash==4.3.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (4.3.1)
Requirement already satisfied: wordcloud>=1.9.3 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (1.9.4)
Requirement already satisfied: dacite>=1.8 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (1.9.2)
Requirement already satisfied: numba<=0.61,>=0.56.0 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (0.60.0)
Requirement already satisfied: PyWavelets in c:\users\b3stu\anaconda3\lib\site-packages (from imagehash==4.3.1->ydata-profiling) (1.7.0)
Requirement already satisfied: pillow in c:\users\b3stu\anaconda3\lib\site-packages (from imagehash==4.3.1->ydata-profiling) (10.4.0)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\b3stu\anaconda3\lib\site-packages (from jinja2<3.2,>=2.11.1->ydata-profiling) (2.1.3)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (4.51.0)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (24.1)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (2.9.0.post0)
Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in c:\users\b3stu\anaconda3\lib\site-packages (from numba<=0.61,>=0.56.0->ydata-profiling) (0.43.0)
Requirement already satisfied: pytz>=2020.1 in c:\users\b3stu\anaconda3\lib\site-packages (from pandas!=1.4.0,<3.0,>1.1->ydata-profiling) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\b3stu\anaconda3\lib\site-packages (from pandas!=1.4.0,<3.0,>1.1->ydata-profiling) (2023.3)
Requirement already satisfied: joblib>=0.14.1 in c:\users\b3stu\anaconda3\lib\site-packages (from phik<0.13,>=0.11.1->ydata-profiling) (1.4.2)
Requirement already satisfied: annotated-types>=0.4.0 in c:\users\b3stu\anaconda3\lib\site-packages (from pydantic>=2->ydata-profiling) (0.6.0)
Requirement already satisfied: pydantic-core==2.20.1 in c:\users\b3stu\anaconda3\lib\site-packages (from pydantic>=2->ydata-profiling) (2.20.1)
Requirement already satisfied: typing-extensions>=4.6.1 in c:\users\b3stu\anaconda3\lib\site-packages (from pydantic>=2->ydata-profiling) (4.15.0)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\b3stu\anaconda3\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in c:\users\b3stu\anaconda3\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\b3stu\anaconda3\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling) (2.2.3)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\b3stu\anaconda3\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling) (2024.12.14)
Requirement already satisfied: patsy>=0.5.6 in c:\users\b3stu\anaconda3\lib\site-packages (from statsmodels<1,>=0.13.2->ydata-profiling) (0.5.6)
Requirement already satisfied: colorama in c:\users\b3stu\anaconda3\lib\site-packages (from tqdm<5,>=4.48.2->ydata-profiling) (0.4.6)
Requirement already satisfied: attrs>=19.3.0 in c:\users\b3stu\anaconda3\lib\site-packages (from visions<0.8.2,>=0.7.5->visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling) (23.1.0)
Requirement already satisfied: networkx>=2.4 in c:\users\b3stu\anaconda3\lib\site-packages (from visions<0.8.2,>=0.7.5->visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling) (3.3)
Requirement already satisfied: puremagic in c:\users\b3stu\anaconda3\lib\site-packages (from visions<0.8.2,>=0.7.5->visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling) (1.30)
Requirement already satisfied: six in c:\users\b3stu\anaconda3\lib\site-packages (from patsy>=0.5.6->statsmodels<1,>=0.13.2->ydata-profiling) (1.16.0)
Upgrade to ydata-sdk

Improve your data and profiling with ydata-sdk, featuring data quality scoring, redundancy detection, outlier identification, text validation, and synthetic data generation.

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/21 [00:00<?, ?it/s]
 10%|▉         | 2/21 [00:00<00:02,  6.80it/s]
 29%|██▊       | 6/21 [00:00<00:01, 13.54it/s]
 48%|████▊     | 10/21 [00:00<00:00, 17.84it/s]
 62%|██████▏   | 13/21 [00:00<00:00, 19.35it/s]
100%|██████████| 21/21 [00:00<00:00, 22.44it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]
✅ Profiling report generated: healthcare_profile_report.html
In [33]:
# Re-create the profiling report with the correct, store-specific title.
profile = ProfileReport(df, title="Emade Store Data Profiling Report", explorative=True)
In [35]:
# Bare last expression: triggers the report's rich inline HTML display.
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/21 [00:00<?, ?it/s]
 10%|▉         | 2/21 [00:00<00:02,  6.78it/s]
 29%|██▊       | 6/21 [00:00<00:01, 11.82it/s]
 48%|████▊     | 10/21 [00:00<00:00, 16.85it/s]
100%|██████████| 21/21 [00:00<00:00, 22.61it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[35]:

In [36]:
import pandas as pd

# Assemble three EDA artifacts and write each to its own Excel sheet.

# Descriptive statistics for every column (numeric and categorical alike).
stats_table = df.describe(include='all').transpose()

# Per-column null counts reshaped into a two-column table.
null_table = df.isnull().sum().reset_index()
null_table.columns = ['Column', 'MissingValues']

# Pairwise correlations restricted to numeric columns.
corr_table = df.corr(numeric_only=True)

# One workbook, three sheets.
with pd.ExcelWriter("healthcare_eda_summary.xlsx") as writer:
    stats_table.to_excel(writer, sheet_name="Summary Stats")
    null_table.to_excel(writer, sheet_name="Missing Values", index=False)
    corr_table.to_excel(writer, sheet_name="Correlations")

print("✅ EDA results saved to healthcare_eda_summary.xlsx")
✅ EDA results saved to healthcare_eda_summary.xlsx
In [49]:
import pandas as pd
import os

# Load the store data with an explicit encoding.
# latin1 decodes every possible byte, which is why it succeeds where the
# default UTF-8 raised — though wrong bytes may decode to wrong characters
# silently.
# NOTE(review): hardcoded absolute path — prefer a configurable DATA_DIR.
df = pd.read_csv(r"C:\Users\B3Stu\OneDrive\Documents\Python Exercises\Emadestore.csv", 
                 encoding='latin1')  # Try latin1 encoding which is more permissive
# Alternative encodings to try: 'cp1252', 'ISO-8859-1', etc.

# 1. Summary statistics for all columns, one row per variable.
summary = df.describe(include='all').transpose().reset_index()
summary['Section'] = "Summary Stats"

# 2. Missing-value counts per column.
missing = df.isnull().sum().reset_index()
missing.columns = ['index', 'MissingValues']
missing['Section'] = "Missing Values"

# 3. Correlation matrix flattened to long (Var1, Var2, Correlation) form.
corr = df.corr(numeric_only=True).stack().reset_index()
corr.columns = ['Var1', 'Var2', 'Correlation']
corr['Section'] = "Correlations"

# Stack the three sections into one long frame; the column sets are unioned,
# so each section pads the others' columns with NaN.
combined = pd.concat([summary, missing, corr], axis=0, ignore_index=True)

# Save to folder path (created if absent).
output_path = "output/healthcare_eda_results.csv"
os.makedirs("output", exist_ok=True)
combined.to_csv(output_path, index=False, encoding='utf-8')  # Specify encoding for output

print(f"✅ File saved at: {output_path}")
✅ File saved at: output/healthcare_eda_results.csv
In [ ]:
r"C:\Users\B3Stu\OneDrive\Documents\Python Exercises\Emadestore.csv"
In [51]:
import pandas as pd

import os

import shutil

import zipfile

from datetime import datetime

# ========== CONFIG ==========
# Path layout for daily EDA exports: output/<YYYY-MM-DD>/healthcare_eda_
# results_<date>.csv, plus a shared archive folder for older runs.
# NOTE(review): `zipfile` is imported but never used in the visible cells —
# presumably intended for zipping archived folders; confirm or remove.

BASE_DIR = "output"

date_str = datetime.today().strftime("%Y-%m-%d")

DAILY_DIR = os.path.join(BASE_DIR, date_str)   # Folder per day

OUTPUT_FILE = os.path.join(DAILY_DIR, f"healthcare_eda_results_{date_str}.csv")

ARCHIVE_DIR = os.path.join(BASE_DIR, "archive")
In [53]:
import pandas as pd

import os

from datetime import datetime

# ========== CONFIG ==========
# Flat layout: the export lands directly in output/ with a date-stamped name
# (note this overwrites the OUTPUT_FILE defined by the per-day config cell).

OUTPUT_DIR = "output"

date_str = datetime.today().strftime("%Y-%m-%d")

OUTPUT_FILE = os.path.join(OUTPUT_DIR, f"healthcare_eda_results_{date_str}.csv")

# Example EDA export
# NOTE(review): assumes healthcare_data.csv sits next to the notebook — a
# later cell comments that this file may not exist; confirm before relying
# on this cell.

df = pd.read_csv("healthcare_data.csv")

# Per-column descriptive statistics, one row per variable.
summary = df.describe(include='all').transpose().reset_index()

summary['Section'] = "Summary Stats"

# Per-column null counts.
missing = df.isnull().sum().reset_index()

missing.columns = ['index', 'MissingValues']

missing['Section'] = "Missing Values"

# Numeric correlations flattened to long (Var1, Var2, Correlation) form.
corr = df.corr(numeric_only=True).stack().reset_index()

corr.columns = ['Var1', 'Var2', 'Correlation']

corr['Section'] = "Correlations"

# Stack the three sections; column sets are unioned, so each section pads
# the others' columns with NaN.
combined = pd.concat([summary, missing, corr], axis=0, ignore_index=True)

os.makedirs(OUTPUT_DIR, exist_ok=True)

combined.to_csv(OUTPUT_FILE, index=False)

print(f"✅ File saved at {OUTPUT_FILE}")
✅ File saved at output\healthcare_eda_results_2025-08-31.csv
In [55]:
# ========== CREATE NEW DAILY FOLDER ==========
# Relies on DAILY_DIR / OUTPUT_FILE from the per-day config cell; run that
# cell first.
os.makedirs(DAILY_DIR, exist_ok=True)

# ========== SAMPLE DATA ==========
# NOTE(review): healthcare_data.csv may not exist alongside this notebook;
# another cell switches to the Emadestore CSV for the same reason.
df = pd.read_csv("healthcare_data.csv")

# Summary statistics, one row per column.
summary = df.describe(include='all').transpose().reset_index()
summary['Section'] = "Summary Stats"

# Missing-value counts per column.
missing = df.isnull().sum().reset_index()
missing.columns = ['index', 'MissingValues']
missing['Section'] = "Missing Values"

# Numeric correlations flattened to (Var1, Var2, Correlation) rows.
corr = df.corr(numeric_only=True).stack().reset_index()
corr.columns = ['Var1', 'Var2', 'Correlation']
corr['Section'] = "Correlations"

# Stack the three sections into one long frame (columns are unioned, so
# each section leaves the others' columns as NaN).
combined = pd.concat([summary, missing, corr], axis=0, ignore_index=True)

# Save into today's folder.
combined.to_csv(OUTPUT_FILE, index=False)

print(f"✅ File saved at {OUTPUT_FILE}")
# BUG FIX: the original also printed "📦 Previous folders zipped into
# {ARCHIVE_DIR}", but no zipping happens in this cell or anywhere else in
# the notebook — the message reported work that was never done, so it has
# been removed.
✅ File saved at output\healthcare_eda_results_2025-08-31.csv
📦 Previous folders zipped into output\archive
In [59]:
import pandas as pd

# Build a small in-memory sample frame (the real CSV is not always
# available), then derive the three summary tables the export cells expect.
df = pd.DataFrame({
    'Postal Code': [25, 30, 45, 60, 35, 42],
    'Sales': [120, 130, 140, 150, 125, 135],
    'Quantity': [200, 220, 240, 260, 210, 230],
    'Discount': [85, 90, 110, 130, 95, 105],
    'Profit': [70, 75, 80, 85, 72, 78]
})

# Transposed describe(): one row per variable, one column per statistic —
# reads better when exported to Excel.
numeric_summary = df.describe().T

# Per-column null census: absolute counts plus percentage of rows.
null_counts = df.isnull().sum()
missing_values = pd.DataFrame({
    'Column': df.columns,
    'Missing Values': null_counts.values,
    'Percentage': (null_counts / len(df) * 100).values
})

# Pearson correlations restricted to numeric dtypes.
correlations = df.select_dtypes(include=['number']).corr()
In [61]:
# Export the three summary tables to Excel, but only when all of them hold
# data; an empty frame would produce a useless sheet.
all_populated = not (numeric_summary.empty or missing_values.empty or correlations.empty)

if all_populated:
    with pd.ExcelWriter("healthcare_eda_full.xlsx") as writer:
        numeric_summary.to_excel(writer, sheet_name="Variable Summary")
        missing_values.to_excel(writer, sheet_name="Missing Values", index=False)
        correlations.to_excel(writer, sheet_name="Correlations")
    print("✅ Full EDA report saved to healthcare_eda_full.xlsx")
✅ Full EDA report saved to healthcare_eda_full.xlsx
In [65]:
# Excel export that writes each table only if it is non-empty.
# NOTE(review): despite the comments, the "Info" sheet is written
# unconditionally — every run gets it, not only runs where a frame is
# empty. It does guarantee the workbook is never sheet-less (openpyxl
# rejects a workbook with zero sheets).
with pd.ExcelWriter("healthcare_eda_full.xlsx") as writer:
    # Write non-empty DataFrames
    if not numeric_summary.empty:
        numeric_summary.to_excel(writer, sheet_name="Variable Summary")
    
    if not missing_values.empty:
        missing_values.to_excel(writer, sheet_name="Missing Values", index=False)
    
    if not correlations.empty:
        correlations.to_excel(writer, sheet_name="Correlations")
    
    # Ensure at least one sheet exists (always written; see note above)
    pd.DataFrame({'Note': ['No data available']}).to_excel(writer, sheet_name="Info")

print("✅ Full EDA report saved to healthcare_eda_full.xlsx (some sheets may be empty)")
✅ Full EDA report saved to healthcare_eda_full.xlsx (some sheets may be empty)
In [67]:
# NOTE(review): this cell duplicates the guarded export above, minus the
# emptiness checks and the Info sheet — it overwrites the same workbook.
# Consider deleting one of the two cells.
with pd.ExcelWriter("healthcare_eda_full.xlsx") as writer:

    numeric_summary.to_excel(writer, sheet_name="Variable Summary")

    missing_values.to_excel(writer, sheet_name="Missing Values", index=False)

    correlations.to_excel(writer, sheet_name="Correlations")

print("✅ Full EDA report saved to healthcare_eda_full.xlsx")
✅ Full EDA report saved to healthcare_eda_full.xlsx
In [77]:
import pandas as pd
import os
import shutil
from datetime import datetime

# ========== CONFIG ==========
BASE_DIR = "output"
date_str = datetime.today().strftime("%Y-%m-%d")
DAILY_DIR = os.path.join(BASE_DIR, date_str)   # one sub-folder per run day
OUTPUT_FILE = os.path.join(DAILY_DIR, f"Emadestore{date_str}.csv")

# ========== ARCHIVE PREVIOUS ==========
# Move any loose files sitting directly under BASE_DIR into an archive
# sub-folder, leaving only per-day directories behind.
if os.path.exists(BASE_DIR):
    # Hoisted out of the loop: the archive path is loop-invariant (the
    # original rebuilt it and re-called makedirs for every file).
    archive_dir = os.path.join(BASE_DIR, "archive")
    for entry in os.listdir(BASE_DIR):
        entry_path = os.path.join(BASE_DIR, entry)
        if os.path.isfile(entry_path):  # directories (daily folders) stay put
            # Created lazily so an output dir with no flat files gains no
            # empty archive folder (matches the original behavior).
            os.makedirs(archive_dir, exist_ok=True)
            shutil.move(entry_path, os.path.join(archive_dir, entry))
In [79]:
# ========== CREATE NEW DAILY FOLDER ==========
# Assumes DAILY_DIR / OUTPUT_FILE come from the config cell that builds
# output/<YYYY-MM-DD>/Emadestore<date>.csv — run that cell first.
os.makedirs(DAILY_DIR, exist_ok=True)

# ========== LOAD DATA ==========
# latin1 decodes every byte, sidestepping the UTF-8 errors this file raises;
# wrong characters may still slip through silently.
# NOTE(review): hardcoded absolute path — prefer a configurable DATA_DIR.
df = pd.read_csv(r"C:\Users\B3Stu\OneDrive\Documents\Python Exercises\Emadestore.csv", 
                 encoding='latin1')  # Try 'latin1' or 'ISO-8859-1' encoding instead of default UTF-8

# Summary statistics, one row per column.
summary = df.describe(include='all').transpose().reset_index()
summary['Section'] = "Summary Stats"

# Missing-value counts per column.
missing = df.isnull().sum().reset_index()
missing.columns = ['index', 'MissingValues']
missing['Section'] = "Missing Values"

# Numeric correlations flattened to (Var1, Var2, Correlation) rows.
corr = df.corr(numeric_only=True).stack().reset_index()
corr.columns = ['Var1', 'Var2', 'Correlation']
corr['Section'] = "Correlations"

# Stack all three sections into one long frame (columns are unioned, so
# each section leaves the others' columns as NaN).
combined = pd.concat([summary, missing, corr], axis=0, ignore_index=True)

# Save into today's folder with explicit UTF-8 output encoding.
combined.to_csv(OUTPUT_FILE, index=False, encoding='utf-8')

print(f"✅ File saved at {OUTPUT_FILE}")
✅ File saved at output\2025-08-31\Emadestore2025-08-31.csv
In [ ]: